Prepare TESLA dataset
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
TESLA= fread("TESLA_DATASET_608.csv")
TESLA=TESLA %>% separate(col=PMHC, into = c("HLA_Allele","Peptide"),sep="_")%>% mutate(HLA_Allele = paste0("HLA-",HLA_Allele))%>%
mutate(Length = nchar(Peptide))%>% mutate(VALIDATED = ifelse(VALIDATED==FALSE,"Negative","Positive"))%>% dplyr::rename(Immunogenicity=VALIDATED)
TESLA=TESLA %>% mutate(nM = NETMHC_PAN_BINDING_AFFINITY) %>% mutate("Thalf(h)" = BINDING_STABILITY) %>% mutate(HydroFraction = FRAC_HYDROPHOBIC)
TESLA %>% glimpse()
## Rows: 608
## Columns: 26
## $ HLA_Allele <chr> "HLA-A*01:01", "HLA-A*01:01", "HLA-A*01:01…
## $ Peptide <chr> "ASSFLKSFY", "ATLFSDSWYY", "CSDSGKSFINY", …
## $ PATIENT_ID <int> 3, 3, 3, 2, 3, 2, 3, 3, 3, 3, 2, 2, 2, 3, …
## $ TISSUE_TYPE <chr> "PBMC", "PBMC", "PBMC", "PBMC", "PBMC", "P…
## $ MHC <chr> "A*01:01", "A*01:01", "A*01:01", "A*01:01"…
## $ ALT_EPI_SEQ <chr> "ASSFLKSFY", "ATLFSDSWYY", "CSDSGKSFINY", …
## $ PEP_LEN <int> 9, 10, 11, 9, 9, 10, 9, 9, 8, 9, 8, 9, 10,…
## $ MEASURED_BINDING_AFFINITY <dbl> 45, 33, 77, 0, 3, 1320, NA, 354, 30, 9, NA…
## $ NETMHC_PAN_BINDING_AFFINITY <dbl> 132.1, 251.7, 64.8, 38.7, 24.0, 481.3, 22.…
## $ TUMOR_ABUNDANCE <dbl> 57.819051, 51.857250, 30.389590, 159.96769…
## $ BINDING_STABILITY <dbl> 0.76, 2.20, 2.24, 2.28, 1.21, 2.80, 1.82, …
## $ FRAC_HYDROPHOBIC <dbl> 0.3333333, 0.3000000, 0.2727273, 0.6666667…
## $ AGRETOPICITY <dbl> 0.019019295, 0.838764951, 0.010851108, 1.2…
## $ FOREIGNNESS <dbl> 0.000000e+00, 0.000000e+00, 9.280000e-20, …
## $ MUTATION_POSITION <int> 2, 3, 11, 5, 8, 10, 9, 8, 7, 1, 2, 1, 1, 7…
## $ NUMBER_PREDICTING <int> 11, 10, 8, 13, 13, 8, 5, 12, 6, 8, 6, 16, …
## $ Immunogenicity <chr> "Negative", "Negative", "Negative", "Negat…
## $ TCR_FLOW_I <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ TCR_FLOW_I_QUANT <dbl> 0.0000, 0.0000, 0.0011, 0.0000, 0.0000, 0.…
## $ TCR_NANOPARTICLE <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ TCR_FLOW_II <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ TCR_FLOW_II_QUANT <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, …
## $ Length <int> 9, 10, 11, 9, 9, 10, 9, 9, 8, 9, 8, 9, 10,…
## $ nM <dbl> 132.1, 251.7, 64.8, 38.7, 24.0, 481.3, 22.…
## $ `Thalf(h)` <dbl> 0.76, 2.20, 2.24, 2.28, 1.21, 2.80, 1.82, …
## $ HydroFraction <dbl> 0.3333333, 0.3000000, 0.2727273, 0.6666667…
Prepare pathogenic dataset
Run netMHCpan 4.0
TEST_DATA_LOCATION = "PATHOGENIC_EPITOPES_NETMHCPAN/"
for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern=":",replacement = "")
LENGTHS=FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])%>% pull(Length) %>% unique
testdata=paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC_data.txt")
write.table(FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i]) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Run model
RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC","_Results.csv")
system(paste0("/Applications/netMHCpan-4.0/netMHCpan -BA -p ",testdata," -a ",unique(FullDataset$HLA_Allele)[allele_i]," -l ",paste0(LENGTHS,collapse = ",")," -xls -xlsfile ", RESULTS_OUTPUT))
}
Read in netMHCpan results
- So to compute nM for each peptide
TEST_DATA_LOCATION = "PATHOGENIC_EPITOPES_NETMHCPAN/"
data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")
data3 <- data_frame(file = files) %>%
mutate(file_contents = map(file,
~ fread(file.path(data_path, .),skip = 1))
)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
Netmhcpanres <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))
Netmhcpanres$HLA_Allele = ClosestMatch2(Netmhcpanres$HLA_Allele,unique(FullDataset$HLA_Allele))
FullDataset=FullDataset %>% inner_join(Netmhcpanres %>% select(! c(file,Pos,ID,core,icore))) %>% mutate(Binder = ifelse(NB==1,"BINDER","NONBINDER"))
## Joining, by = c("Peptide", "HLA_Allele")
FullDataset %>% nrow
## [1] 32698
Filter only for binders for susbequent analysis
- Left with 24924 22717 (5772 and 25% immunogenic)
FullDataset=FullDataset %>% filter(Binder == 'BINDER')
FullDataset %>% nrow
## [1] 22717
FullDataset %>% select(Immunogenicity) %>% table
## .
## Negative Positive
## 16945 5772
FullDataset %>% select(Immunogenicity) %>% table %>% prop.table()
## .
## Negative Positive
## 0.7459172 0.2540828
Run netmhc stabpan
- Generate a binding stability prediction
TEST_DATA_LOCATION = "PATHOGENIC_EPITOPES_STABPAN/"
for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern=":",replacement = "")
DATASET = FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])
for(i in 1:length(unique(DATASET$Length))){
LENGTH = unique(DATASET$Length)[i]
testdata=paste0(TEST_DATA_LOCATION,"LENGTH_",LENGTH,"_Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC_data.txt")
write.table(DATASET %>% filter(Length == LENGTH) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Run model
RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"LENGTH_",LENGTH,"_Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC","_Results.csv")
system(paste0("/Applications/netMHCstabpan-1.0/netMHCstabpan -p ",testdata," -a ",unique(FullDataset$HLA_Allele)[allele_i]," -l ",LENGTH," -xls -xlsfile ", RESULTS_OUTPUT))
}
}
Read in predictions from stab pan
TEST_DATA_LOCATION = "PATHOGENIC_EPITOPES_STABPAN/"
data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")
data3 <- data_frame(file = files) %>%
mutate(file_contents = map(file,
~ fread(file.path(data_path, .),skip = 1))
)
Netmhcpanres <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
# Extract HLA and clean
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$HLA_Allele,pattern="LENGTH_[0-9]*_",replacement = ""))
# map HLA nomenclature
Netmhcpanres$HLA_Allele = ClosestMatch2(Netmhcpanres$HLA_Allele,unique(FullDataset$HLA_Allele))
Netmhcpanres=Netmhcpanres %>% select(Peptide, "Thalf(h)",HLA_Allele)
FullDataset=FullDataset %>% inner_join(Netmhcpanres, by=c("Peptide","HLA_Allele"))
# 22717 obs so joining worked correctly.
FullDataset %>% nrow
## [1] 22717
Compute hydrophobic fraction
library(Peptides)
library(stringi)
HYDROPHOBIC_RESIDUES = c("V","I","L","F","M","W","C") # from BARNES 2003 (SEE WELLS PAPER)
FullDataset=FullDataset %>% mutate(HydrophobicCount = stri_count_regex(FullDataset$Peptide, paste0(HYDROPHOBIC_RESIDUES,collapse = "|"))) %>% mutate(HydroFraction = HydrophobicCount/Length)
Finalise TESLA vs Pathogenic data for analysis
TESLA = TESLA %>% mutate(Dataset = "Cancer")
FullDataset = FullDataset%>% mutate(Dataset = "Pathogenic")
TESLASubset=TESLA %>% select(Peptide,Immunogenicity, HLA_Allele, nM, "Thalf(h)", HydroFraction, Dataset)
PathogenicSubset=FullDataset%>% select(Peptide,Immunogenicity, HLA_Allele, nM, "Thalf(h)", HydroFraction, Dataset)
combinedDataset = rbind(TESLASubset,PathogenicSubset)
combinedDataset %>% select(Dataset,Immunogenicity)%>% table
## Immunogenicity
## Dataset Negative Positive
## Cancer 571 37
## Pathogenic 16945 5772
combinedDataset =combinedDataset%>% mutate(Dataset = factor(Dataset, levels = c("Pathogenic","Cancer")))
Results
Affinity comparison between TESLA vs pathogenic data
BA_PATHTESLA_DENSITY_PLT= combinedDataset %>% ggplot(aes(x=nM, fill=Dataset))+facet_wrap(~Immunogenicity)+geom_density(aes(y=..density..),alpha=0.4,position = "identity")+theme_pubr(base_size = 16)+
scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+xlab("Binding Affinity\n(nM)")
mycomparisons =list(c("Positive","Negative"))
MEDIAN_DATA_TESLA_NEG = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(nM))
MEDIAN_DATA_TESLA_POS = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(nM))
BA_PATHTESLA_VIOLIN_PLT=combinedDataset %>% ggviolin(x="Immunogenicity",y="nM",add="boxplot")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+stat_compare_means(label = "p.signif",label.x.npc = "center")+ylab("Binding Affinity\n(nM)")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)
BA_PATHTESLA_BOX_PLT=combinedDataset %>% ggplot(aes(x=Immunogenicity, y=nM, fill=Dataset))+
geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Affinity\n(nM)")+theme_pubr(base_size = 16)
BA_PATHTESLA_ECDF_PLT=PathogenicSubset %>% select(nM, Immunogenicity, Peptide) %>% mutate(Dataset="Pathogenic")%>%rbind(
TESLASubset %>% select(nM, Immunogenicity,Peptide)%>% mutate(Dataset = "Cancer")
)%>% mutate(Dataset = paste0(Dataset,"_",Immunogenicity))%>% ggplot(aes(x=nM, color=Dataset))+stat_ecdf(size=2)+scale_x_log10()+theme_pubr(base_size = 16)+grids()+ guides(color = guide_legend(nrow = 2))+ylab("Cumulative Freq. of Peptides")+xlab("Binding Affinity\n(nM)")
plot_grid(BA_PATHTESLA_DENSITY_PLT, BA_PATHTESLA_BOX_PLT+theme(legend.position = "none"),BA_PATHTESLA_VIOLIN_PLT, BA_PATHTESLA_ECDF_PLT,nrow=1,align="hv", rel_widths = c(1,0.75,1.05,1.2),axis="bl")

Stability
- Next we look into stability in hrs between groups
STAB_PATHTESLA_DENSITY_PLT=combinedDataset %>% ggplot(aes(x=`Thalf(h)`, fill=Dataset))+facet_wrap(~Immunogenicity)+geom_density(aes(y=..density..),alpha=0.4,position = "identity")+theme_pubr(base_size = 16)+
scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+xlab("Binding Stability\n(hrs)")
MEDIAN_DATA_TESLA_NEG = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(`Thalf(h)`))
MEDIAN_DATA_TESLA_POS = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(`Thalf(h)`))
STAB_PATHTESLA_VIOLIN_PLT=combinedDataset %>% ggviolin(x="Immunogenicity",y="Thalf(h)",add="boxplot")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Stability\n(hrs)")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)
STAB_PATHTESLA_BOX_PLT=combinedDataset %>% ggplot(aes(x=Immunogenicity, y=`Thalf(h)`, fill=Dataset))+
geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Stability\n(hrs)")+theme_pubr(base_size = 16)
STAB_PATHTESLA_ECDF_PLT=PathogenicSubset %>% select(`Thalf(h)`, Immunogenicity, Peptide) %>% mutate(Dataset="Pathogenic")%>%rbind(
TESLASubset %>% select(`Thalf(h)`, Immunogenicity,Peptide)%>% mutate(Dataset = "Cancer")
)%>% mutate(Dataset = paste0(Dataset,"_",Immunogenicity))%>% ggplot(aes(x=`Thalf(h)`, color=Dataset))+stat_ecdf(size=2)+scale_x_log10()+theme_pubr(base_size = 16)+grids()+ guides(color = guide_legend(nrow = 2))+ylab("Cumulative Freq. of Peptides")+xlab("Binding Stability\n(hrs)")
library(cowplot)
plot_grid(STAB_PATHTESLA_DENSITY_PLT, STAB_PATHTESLA_BOX_PLT+theme(legend.position = "none"),STAB_PATHTESLA_VIOLIN_PLT, STAB_PATHTESLA_ECDF_PLT,nrow=1,align="hv", rel_widths = c(1,0.75,1.05,1.2),axis="bl")

Fraction Hydrophobic
HYDRO_PATHTESLA_DENSITY_PLT= combinedDataset %>% select(Peptide,Immunogenicity,Dataset,HydroFraction) %>% distinct() %>% ggplot(aes(x=HydroFraction, fill=Dataset))+facet_wrap(~Immunogenicity)+geom_density(aes(y=..density..),alpha=0.4,position = "identity")+theme_pubr(base_size = 16)+xlab("Fraction Hydrophobic")
MEDIAN_DATA_TESLA_NEG = combinedDataset%>% select(Peptide,Immunogenicity,Dataset,HydroFraction) %>% distinct() %>% filter(Dataset == 'Cancer', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(HydroFraction))
MEDIAN_DATA_TESLA_POS = combinedDataset %>% select(Peptide,Immunogenicity,Dataset,HydroFraction) %>% distinct()%>% filter(Dataset == 'Cancer', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(HydroFraction))
HYDRO_PATHTESLA_VIOLIN_PLT=combinedDataset %>% select(Peptide,Immunogenicity,Dataset,HydroFraction) %>% distinct()%>% ggboxplot(x="Immunogenicity",y="HydroFraction")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+stat_compare_means(label = "p.signif",label.x.npc = "center",comparisons = mycomparisons)+ylab("Fraction Hydrophobic")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)
HYDRO_PATHTESLA_BOX_PLT=combinedDataset %>% select(Peptide,Immunogenicity,Dataset,HydroFraction) %>% distinct()%>% ggplot(aes(x=Immunogenicity, y=HydroFraction, fill=Dataset))+
geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+ylab("Fraction Hydrophobic\n ")+theme_pubr(base_size = 16)
HYDRO_PATHTESLA_ECDF_PLT=PathogenicSubset %>% select(HydroFraction, Immunogenicity, Peptide) %>% mutate(Dataset="Pathogenic")%>% distinct()%>%rbind(
TESLASubset %>% select(HydroFraction, Immunogenicity,Peptide)%>% mutate(Dataset = "Cancer")%>% distinct()
)%>% mutate(Dataset = paste0(Dataset,"_",Immunogenicity))%>% ggplot(aes(x=HydroFraction, color=Dataset))+stat_ecdf(size=2)+theme_pubr(base_size = 16)+grids()+ guides(color = guide_legend(nrow = 2))+ylab("Cumulative Freq. of Peptides")+xlab("Fraction Hydrophobic")
library(cowplot)
plot_grid(HYDRO_PATHTESLA_DENSITY_PLT+rotate_x_text(angle=90), HYDRO_PATHTESLA_BOX_PLT+theme(legend.position = "none"),HYDRO_PATHTESLA_VIOLIN_PLT, HYDRO_PATHTESLA_ECDF_PLT,nrow=1,align="hv", rel_widths = c(1,0.75,1.05,1.2),axis="bl")
